Importing Required Libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

Importing the Dataset¶

In [2]:
# Load the fraud data
Data=pd.read_csv("C:/Users/Mus/Downloads/fraudTrain.csv")

# Show the first few rows of the dataset
print(Data.head())
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48.8878 -118.2105   
2  Sanchez      M      594 White Dale Suite 530  ...  42.1808 -112.2620   
3    White      M   9443 Cynthia Court Apt. 038  ...  46.2306 -112.1138   
4   Garcia      M              408 Bradley Rest  ...  38.4207  -79.4629   

   city_pop                                job         dob  \
0      3495          Psychologist, counselling  1988-03-09   
1       149  Special educational needs teacher  1978-06-21   
2      4154        Nature conservation officer  1962-01-19   
3      1939                    Patent attorney  1967-01-12   
4        99     Dance movement psychotherapist  1986-03-28   

                          trans_num   unix_time  merch_lat  merch_long  \
0  0b242abb623afc578575680df30655b9  1325376018  36.011293  -82.048315   
1  1f76529f8574734946361c461b024d99  1325376044  49.159047 -118.186462   
2  a1a22d70485983eac12b5b88dad1cf95  1325376051  43.150704 -112.154481   
3  6b849c168bdad6f867558c3793159a81  1325376076  47.034331 -112.561071   
4  a41d7549acf90789359a9aa5346dcb46  1325376186  38.674999  -78.632459   

   is_fraud  
0         0  
1         0  
2         0  
3         0  
4         0  

[5 rows x 23 columns]
In [3]:
# Display the count of null values in each column of the DataFrame
print(Data.isnull().sum())
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

This analysis shows that the data has no missing values.

Descriptive statistics¶

In [4]:
# Map the is_fraud values to "No" and "Yes"
Data["fraud_label"] = Data["is_fraud"].map({0: "No", 1: "Yes"})

#Count the number of fraudulent and non-fraudulent transactions
fraud_counts = Data["fraud_label"].value_counts()

# Create the donut chart

figure = px.pie(
    values=fraud_counts.values, 
    names=fraud_counts.index, 
    hole=0.5, 
    title="Distribution of fraudulent vs non-fraudulent transactions"
)

# Afficher le diagramme
figure.show()

The data shows that 99.4% of the cases are non-fraudulent, whereas 0.6% are fraudulent.

In [5]:
# Filter the data to include only fraud cases
fraud_data = Data[Data["is_fraud"] == 1]

# Count the number of fraud cases per category
fraud_by_job = fraud_data["category"].value_counts()

# Create the bar chart
figure = px.bar(
    fraud_by_job,
    x=fraud_by_job.index,
    y=fraud_by_job.values,
    title="Number of fraud cases per Category",
    labels={'x': "Category", 'y': "Number of fraud cases"},
    color=fraud_by_job.values, 
    color_continuous_scale='Viridis' 
)

# Show the chart
figure.show()

This graph illustrates the number of fraud cases across different types of transactions or expenses. It indicates that fraud is more prevalent in grocery purchases and online shopping, while fraud cases in travel-related transactions are comparatively lower.

In [6]:
# Count the number of fraud cases per gender
fraud_by_gender = fraud_data["gender"].value_counts()

# Create the pie chart
figure = px.pie(
    fraud_by_gender,
    values=fraud_by_gender.values,
    names=fraud_by_gender.index,
    title="Distribution of fraud cases by Gender"
)

# Show the chart
figure.show()

We observed that 50.2% of the fraud cases involve men, with the remaining percentage accounted for by women.

In [7]:
# Count the number of fraud cases per state
fraud_by_state = fraud_data["state"].value_counts()

# Create the bar chart
figure = px.bar(
    fraud_by_state,
    x=fraud_by_state.index,
    y=fraud_by_state.values,
    title="Number of fraud cases per State",
    labels={'x': "State", 'y': "Number of fraud cases"},
    color=fraud_by_state.values, 
    color_continuous_scale='Blues' 
)

# Show the chart
figure.show()

The graph shows that New York (NW) has the highest number of fraud cases (555), followed by Texas (TX) with 479 cases and Pennsylvania (PA) with 458 cases. In contrast, Hawaii (HI) has the lowest number of fraud cases, totaling 7, which indicates a significantly lower incidence compared to the other states.

New York, Texas and Pennsylvania have the highest percentage of fraud cases, likely because of their large populations and high economic activity. As major economic centers with many transactions, they provide more opportunities for fraud. On the other hand, Hawaii, being less densely populated and economically isolated, naturally experiences fewer incidents of fraud.

In [8]:
# Ensure the date is in datetime format
Data['trans_date_trans_time'] = pd.to_datetime(Data['trans_date_trans_time'])

# Extract the day of the week from the transaction date
Data['day_of_week'] = Data['trans_date_trans_time'].dt.day_name()

# Count the number of fraud cases per day of the week
fraud_cases_per_day = Data[Data['is_fraud'] == 1]['day_of_week'].value_counts()

# Create a pie chart
figure = px.pie(
    fraud_cases_per_day,
    values=fraud_cases_per_day.values,
    names=fraud_cases_per_day.index,
    title="Number of fraud cases per Day of the week"
)

# Show the pie chart
figure.show()

Fraud cases are more frequent at the beginning and end of the week, with weekends showing particularly high incidence rates, followed by a decline throughout the rest of the week.

In [9]:
# Ensure the 'dob' column is in datetime format
Data['dob'] = pd.to_datetime(Data['dob'])

# Create a new 'age' column that calculates the difference between the current date and the date of birth
Data['age'] = Data['dob'].apply(lambda x: datetime.now().year - x.year - ((datetime.now().month, datetime.now().day) < (x.month, x.day)))
In [10]:
# Define the age bins
bins = [19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 99 ]

# Define the labels for these bins
labels = ['19-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69','70-74', '75-79', '80-84','85-89', '90-94', '95-99']

# Create a new column 'age_group' with the age groups
Data['age_group'] = pd.cut(Data['age'], bins=bins, labels=labels, right=True, include_lowest=True)
In [11]:
# Count the number of fraud cases by age group
fraud_by_age_group = Data[Data['is_fraud'] == 1].groupby('age_group').size()

# Plotting the results
ax = fraud_by_age_group.plot(kind='bar', color='skyblue')

for i in ax.containers:
    ax.bar_label(i)

plt.title('Number of fraud cases by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Number of fraud Cases')
plt.xticks(rotation=45)
plt.show()

Fraud cases are most frequent among individuals aged 25 to 69, indicating that both older adults and younger middle-aged individuals are at significant risk. In contrast, younger and older age groups may be less susceptible to fraud.

In [12]:
# Ensure the date is in datetime format
Data['trans_date_trans_time'] = pd.to_datetime(Data['trans_date_trans_time'])

# Extract the hour from the transaction date
Data['hour'] = Data['trans_date_trans_time'].dt.hour

# Select the relevant variables for the correlation matrix
selected_data = Data[['hour', 'city_pop', 'amt', 'age']]

# Calculate the correlation matrix
correlation_matrix = selected_data.corr()

# Plot the correlation matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

The resulting heatmap indicates that these correlations are very low, suggesting that the variables amount, city population, age and hour of transaction are not strongly related to each other.

In [13]:
# Count the number of fraud cases per hour
fraud_by_hour = Data[Data['is_fraud'] == 1].groupby('hour').size().reset_index(name='fraud_count')

# Create the line plot
plt.figure(figsize=(10, 6))
plt.plot(fraud_by_hour['hour'], fraud_by_hour['fraud_count'], marker='o', linestyle='-', color='royalblue')
plt.fill_between(fraud_by_hour['hour'], fraud_by_hour['fraud_count'], color='royalblue', alpha=0.3)

# Customize the plot
plt.title('Fraud cases by Hour of the day')
plt.xlabel('Hour of the day')
plt.ylabel('Number of fraud cases')
plt.xticks(fraud_by_hour['hour'], [f"{x:02d}h" for x in fraud_by_hour['hour']])
plt.grid(True)

# Show the plot
plt.show()

The line plot shows that fraud cases are more frequent in the evening, likely due to reduced vigilance or monitoring. After 11 PM, there's a sharp decline in fraud cases.

In [14]:
# Initialize an empty dictionary to store the p-values
p_values = {}

# Perform Chi-Square Test for each variable and store the p-value
contingency_table1 = pd.crosstab(Data['gender'], Data['is_fraud'])
chi2, p, dof, expected = chi2_contingency(contingency_table1)
p_values['gender'] = p

contingency_table2 = pd.crosstab(Data['day_of_week'], Data['is_fraud'])
chi2, p, dof, expected = chi2_contingency(contingency_table2)
p_values['day_of_week'] = p

contingency_table3 = pd.crosstab(Data['age_group'], Data['is_fraud'])
chi2, p, dof, expected = chi2_contingency(contingency_table3)
p_values['age_group'] = p

# Print all p-values
for variable, p_value in p_values.items():
    print(f"P-value for {variable}: {p_value}")
P-value for gender: 3.627211385830374e-18
P-value for day_of_week: 2.2877424359174457e-37
P-value for age_group: 3.481065950288757e-107

The results of the test indicate a statistically significant relationship between fraud and the variables : gender, age_group and day of the week.

Fraud Detection Model¶

In [15]:
print(Data)
         Unnamed: 0 trans_date_trans_time               cc_num  \
0                 0   2019-01-01 00:00:18     2703186189652095   
1                 1   2019-01-01 00:00:44         630423337322   
2                 2   2019-01-01 00:00:51       38859492057661   
3                 3   2019-01-01 00:01:16     3534093764340240   
4                 4   2019-01-01 00:03:06      375534208663984   
...             ...                   ...                  ...   
1296670     1296670   2020-06-21 12:12:08       30263540414123   
1296671     1296671   2020-06-21 12:12:19     6011149206456997   
1296672     1296672   2020-06-21 12:12:32     3514865930894695   
1296673     1296673   2020-06-21 12:13:36     2720012583106919   
1296674     1296674   2020-06-21 12:13:37  4292902571056973207   

                                    merchant       category     amt  \
0                 fraud_Rippin, Kub and Mann       misc_net    4.97   
1            fraud_Heller, Gutmann and Zieme    grocery_pos  107.23   
2                       fraud_Lind-Buckridge  entertainment  220.11   
3         fraud_Kutch, Hermiston and Farrell  gas_transport   45.00   
4                        fraud_Keeling-Crist       misc_pos   41.96   
...                                      ...            ...     ...   
1296670                    fraud_Reichel Inc  entertainment   15.56   
1296671             fraud_Abernathy and Sons    food_dining   51.70   
1296672                 fraud_Stiedemann Ltd    food_dining  105.93   
1296673  fraud_Reinger, Weissnat and Strosin    food_dining   74.90   
1296674  fraud_Langosh, Wintheiser and Hyatt    food_dining    4.30   

               first       last gender                         street  ...  \
0           Jennifer      Banks      F                 561 Perry Cove  ...   
1          Stephanie       Gill      F   43039 Riley Greens Suite 393  ...   
2             Edward    Sanchez      M       594 White Dale Suite 530  ...   
3             Jeremy      White      M    9443 Cynthia Court Apt. 038  ...   
4              Tyler     Garcia      M               408 Bradley Rest  ...   
...              ...        ...    ...                            ...  ...   
1296670         Erik  Patterson      M       162 Jessica Row Apt. 072  ...   
1296671      Jeffrey      White      M  8617 Holmes Terrace Suite 651  ...   
1296672  Christopher  Castaneda      M     1632 Cohen Drive Suite 639  ...   
1296673       Joseph     Murray      M           42933 Ryan Underpass  ...   
1296674      Jeffrey      Smith      M           135 Joseph Mountains  ...   

                                trans_num   unix_time  merch_lat  merch_long  \
0        0b242abb623afc578575680df30655b9  1325376018  36.011293  -82.048315   
1        1f76529f8574734946361c461b024d99  1325376044  49.159047 -118.186462   
2        a1a22d70485983eac12b5b88dad1cf95  1325376051  43.150704 -112.154481   
3        6b849c168bdad6f867558c3793159a81  1325376076  47.034331 -112.561071   
4        a41d7549acf90789359a9aa5346dcb46  1325376186  38.674999  -78.632459   
...                                   ...         ...        ...         ...   
1296670  440b587732da4dc1a6395aba5fb41669  1371816728  36.841266 -111.690765   
1296671  278000d2e0d2277d1de2f890067dcc0a  1371816739  38.906881  -78.246528   
1296672  483f52fe67fabef353d552c1e662974c  1371816752  33.619513 -105.130529   
1296673  d667cdcbadaaed3da3f4020e83591c83  1371816816  42.788940 -103.241160   
1296674  8f7c8e4ab7f25875d753b422917c98c9  1371816817  46.565983 -114.186110   

         is_fraud  fraud_label day_of_week age age_group  hour  
0               0           No     Tuesday  36     35-39     0  
1               0           No     Tuesday  46     45-49     0  
2               0           No     Tuesday  62     60-64     0  
3               0           No     Tuesday  57     55-59     0  
4               0           No     Tuesday  38     35-39     0  
...           ...          ...         ...  ..       ...   ...  
1296670         0           No      Sunday  62     60-64    12  
1296671         0           No      Sunday  44     40-44    12  
1296672         0           No      Sunday  56     55-59    12  
1296673         0           No      Sunday  44     40-44    12  
1296674         0           No      Sunday  29     25-29    12  

[1296675 rows x 28 columns]
In [16]:
useless_cols = [
    'Unnamed: 0',
    'merchant',
    'cc_num',
    'first',
    'zip',
    'last',
    'trans_num',
    'unix_time',
    'street',
    'merch_lat',
    'merch_long',
    'job',
    'trans_date_trans_time',
    'age_group',
    'dob',
    'lat',
    'long',
    'city',
    'fraud_label',
    'state'
    
]

# Dropping the columns
Data.drop(columns=useless_cols, inplace=True)
In [17]:
Data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   category     1296675 non-null  object 
 1   amt          1296675 non-null  float64
 2   gender       1296675 non-null  object 
 3   city_pop     1296675 non-null  int64  
 4   is_fraud     1296675 non-null  int64  
 5   day_of_week  1296675 non-null  object 
 6   age          1296675 non-null  int64  
 7   hour         1296675 non-null  int32  
dtypes: float64(1), int32(1), int64(3), object(3)
memory usage: 74.2+ MB
In [18]:
# Identify numeric columns in the dataset
numeric_columns = [i for i in Data.columns if Data[i].dtype == 'int64' or Data[i].dtype =='int32' or Data[i].dtype =='float64']

# Identify categorical columns in the dataset
categorical_columns = [i for i in Data.columns if Data[i].dtype == 'object']
In [19]:
print("Numeric Columns:\n", numeric_columns)
print("\nCategorical Columns:\n", categorical_columns)
Numeric Columns:
 ['amt', 'city_pop', 'is_fraud', 'age', 'hour']

Categorical Columns:
 ['category', 'gender', 'day_of_week']
In [20]:
# Encode categorical columns to prepare the data for modeling
Encoder = LabelEncoder()

for col in categorical_columns:
    Data[col] = Encoder.fit_transform(Data[col])

print(Data)
         category     amt  gender  city_pop  is_fraud  day_of_week  age  hour
0               8    4.97       0      3495         0            5   36     0
1               4  107.23       0       149         0            5   46     0
2               0  220.11       1      4154         0            5   62     0
3               2   45.00       1      1939         0            5   57     0
4               9   41.96       1        99         0            5   38     0
...           ...     ...     ...       ...       ...          ...  ...   ...
1296670         0   15.56       1       258         0            3   62    12
1296671         1   51.70       1       100         0            3   44    12
1296672         1  105.93       1       899         0            3   56    12
1296673         1   74.90       1      1126         0            3   44    12
1296674         1    4.30       1       218         0            3   29    12

[1296675 rows x 8 columns]
In [21]:
# Separate the dataset into non-fraud and fraud classes
non_fraud_class = Data[Data['is_fraud'] == 0]
fraud_class = Data[Data['is_fraud'] == 1]

non_fraud_count,fraud_count=Data['is_fraud'].value_counts()
In [22]:
print("Le nombre d'observations dans non_fraud_class :", non_fraud_count)
print("Le nombre d'observations dans fraud_class :", fraud_count)
Le nombre d'observations dans non_fraud_class : 1289169
Le nombre d'observations dans fraud_class : 7506

We observe that the non-fraud class is significantly overrepresented compared to the fraud class. To address this imbalance, undersampling is applied to balance the dataset before modeling.

In [23]:
# Randomly sample from the non-fraud class to match the number of fraud cases (undersampling)
non_fraud_under = non_fraud_class.sample(fraud_count)

# Combine the undersampled non-fraud cases with the fraud cases to create a balanced dataset
under_sampled = pd.concat([non_fraud_under, fraud_class], axis=0)
In [24]:
print(under_sampled)
         category      amt  gender  city_pop  is_fraud  day_of_week  age  hour
1267150         8   129.69       0      6469         0            6   45     7
69280           7    50.01       1       140         0            3   34    21
621809          6    35.00       1       798         0            2   98    19
557749          0    96.47       1     33804         0            3   33    22
715748          4   124.16       1      1762         0            2   62     5
...           ...      ...     ...       ...       ...          ...  ...   ...
1295399        11   977.01       0    105638         1            3   38     1
1295491        11  1210.91       0    105638         1            3   38     1
1295532         2    10.24       1     71335         1            3   30     2
1295666         2    21.69       0        23         1            3   54     3
1295733         2    10.20       1     71335         1            3   30     3

[15012 rows x 8 columns]
In [25]:
# Separate features and target variable from the undersampled dataset
X_under=under_sampled.drop('is_fraud',axis=1)
y_under = under_sampled['is_fraud']
In [26]:
# Get a list of column names from the undersampled dataset, excluding 'is_fraud'
columns = under_sampled.columns.tolist() 
columns.remove('is_fraud')

# Initialize the StandardScaler
scaler = StandardScaler()

# Standardize the numeric features by scaling them
X_under[columns] = scaler.fit_transform(X_under[columns])
under_sampled[X_under.columns] = X_under
under_sampled
Out[26]:
category amt gender city_pop is_fraud day_of_week age hour
1267150 0.319504 -0.457529 -0.963624 -0.275034 0 1.683560 -0.376063 -0.759402
69280 0.062683 -0.672017 1.037749 -0.294870 0 0.104124 -0.978567 0.902859
621809 -0.194139 -0.712422 1.037749 -0.292808 0 -0.422354 2.526910 0.665393
557749 -1.735068 -0.546953 1.037749 -0.189361 0 0.104124 -1.033340 1.021592
715748 -0.707782 -0.472415 1.037749 -0.289786 0 -0.422354 0.555079 -0.996868
... ... ... ... ... ... ... ... ...
1295399 1.089969 1.823340 -0.963624 0.035779 1 0.104124 -0.759475 -1.471800
1295491 1.089969 2.452967 -0.963624 0.035779 1 0.104124 -0.759475 -1.471800
1295532 -1.221425 -0.779072 1.037749 -0.071732 1 0.104124 -1.197659 -1.353067
1295666 -1.221425 -0.748250 -0.963624 -0.295237 1 0.104124 0.116894 -1.234334
1295733 -1.221425 -0.779180 1.037749 -0.071732 1 0.104124 -1.197659 -1.234334

15012 rows × 8 columns

In [61]:
# Split the undersampled dataset into training and testing sets
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(X_under, y_under, random_state=0)
In [62]:
# Initialize the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier()
}
In [63]:
# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    score_f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, score_f1
In [64]:
# Evaluate each model
results = {}
for name, model in models.items():
    accuracy, precision, recall, score_f1 = evaluate_model(model, X_train_under, y_train_under, X_test_under, y_test_under)
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': score_f1
    }
# Create a DataFrame to display the results
results_df = pd.DataFrame(results).T
results_df
C:\Users\Mus\AppData\Local\anaconda3\Lib\site-packages\sklearn\ensemble\_weight_boosting.py:527: FutureWarning:

The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning.

Out[64]:
Accuracy Precision Recall F1 Score
Logistic Regression 0.852918 0.945502 0.754322 0.839161
Decision Tree 0.965627 0.967437 0.964903 0.966168
Random Forest 0.971756 0.975726 0.968570 0.972135
Gradient Boosting 0.962963 0.970244 0.956522 0.963334
XGBoost 0.976286 0.975444 0.977999 0.976720
SVC 0.867573 0.914806 0.815610 0.862365
KNN 0.895817 0.910270 0.882137 0.895983
Naive Bayes 0.788702 0.952188 0.615506 0.747693
AdaBoost 0.937117 0.947087 0.928235 0.937566

After developing and evaluating 9 models, the XGBoost model proved to be the most effective, achieving an F1-Score of 97.6%. This indicates that XGBoost is highly accurate in identifying fraud cases. The Random Forest model was a close second, with an F1-Score of 97.2%.

Now, let's take a closer look at the XGBoost model.

In [75]:
# XGboost Model

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Train the model on the training data
xgb_model.fit(X_train_under, y_train_under)

# Print confusion matrix and classification report

# Predict on the test set
y_pred = xgb_model.predict(X_test_under)

conf_matrix_xgb = confusion_matrix(y_test_under, y_pred)
print('Confusion Matrix:')
print(conf_matrix)
print('\nClassification Report:')
print(classification_report(y_test_under, y_pred))
Confusion Matrix:
[[1797   47]
 [  42 1867]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1844
           1       0.98      0.98      0.98      1909

    accuracy                           0.98      3753
   macro avg       0.98      0.98      0.98      3753
weighted avg       0.98      0.98      0.98      3753

Going Further: Analyzing Additional Performance Metrics for Model Evaluation¶

To detect fraud, we can employ metrics such as total cost using a cost-sensitive matrix. By calculating the total cost, we can identify the model that minimizes overall financial and operational losses.

In [70]:
# Define the different costs
# Cost for false positives
Cost_FP = 2
# Cost for false negatives; assumes it varies based on the 'amount' column
Cost_FN = Data_sampled['amt']
# Cost for true positives
Cost_TP = 2
# Cost for true negatives
Cost_TN = 0  
In [78]:
# Define a function to compute confusion matrix
def get_confusion_matrix(model, X_test, y_test):
    y_pred = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)
    return conf_matrix

# Evaluate each model and compute the confusion matrix
confusion_matrices = {}
for name, model in models.items():
    conf_matrix = get_confusion_matrix(model, X_test_under, y_test_under)
    confusion_matrices[name] = conf_matrix

# Create a DataFrame to display confusion matrices
confusion_df = pd.DataFrame({
    'Model': list(confusion_matrices.keys()),
    'Confusion Matrix': [confusion_matrices[name] for name in confusion_matrices]
})

print(confusion_df)
                 Model            Confusion Matrix
0  Logistic Regression   [[1761, 83], [469, 1440]]
1        Decision Tree    [[1782, 62], [67, 1842]]
2        Random Forest    [[1798, 46], [60, 1849]]
3    Gradient Boosting    [[1788, 56], [83, 1826]]
4              XGBoost    [[1797, 47], [42, 1867]]
5                  SVC  [[1699, 145], [352, 1557]]
6                  KNN  [[1678, 166], [225, 1684]]
7          Naive Bayes   [[1785, 59], [734, 1175]]
8             AdaBoost   [[1745, 99], [137, 1772]]
In [79]:
# Define a function to calculate the total cost based on the confusion matrix and associated costs
def calculate_cost(conf_matrix, Cost_FP, Cost_FN, Cost_TP, Cost_TN):
    
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]
    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    
    # Calculate the total cost
    total_cost = FP * Cost_FP + FN * np.mean(Cost_FN) + TP * Cost_TP + FN * Cost_TN
    return total_cost
In [81]:
# Calculate and store costs for each model
costs = {}
for name, conf_matrix in confusion_matrices.items():
    total_cost = calculate_cost(conf_matrix, Cost_FP, Cost_FN, Cost_TP, Cost_TN)
    costs[name] = total_cost
In [88]:
# Create a DataFrame to display the model names and their corresponding costs
costs_df = pd.DataFrame({
    'Model': list(costs.keys()),
    'Total Cost': [round(cost, 2) for cost in costs.values()]
})

print(costs_df)
                 Model  Total Cost
0  Logistic Regression    36081.56
1        Decision Tree     8527.37
2        Random Forest     8016.30
3    Gradient Boosting     9610.38
4              XGBoost     6786.41
5                  SVC    28198.28
6                  KNN    19548.61
7          Naive Bayes    54169.70
8             AdaBoost    13392.05

The model with a high F1 Score is generally expected to have a low total cost, and this is what we have in our results, Naive Bayes has the highest total cost and XGBoost has the lowest.

We can also use Saving Score as a metric to evaluate cost improvements.

In [102]:
# Find the reference cost
# Note: In this case, the highest cost among all models is used as the reference.
reference_cost = max(costs.values())

# Calculate saving score percentage for each model
saving_score_percent = {model: ((reference_cost - cost) / reference_cost) * 100 for model, cost in costs.items()}

# Create a DataFrame to display saving score percentage
saving_score_df = pd.DataFrame({
    'Model': list(saving_score_percent.keys()),
    'Saving Score (%)': [round(saving) for saving in saving_score_percent.values()]
})

print(saving_score_df)
                 Model  Saving Score (%)
0  Logistic Regression                33
1        Decision Tree                84
2        Random Forest                85
3    Gradient Boosting                82
4              XGBoost                87
5                  SVC                48
6                  KNN                64
7          Naive Bayes                 0
8             AdaBoost                75

The XGBoost model achieves the highest saving score at 87%, followed by Random Forest and Decision Tree, with saving scores of 85% and 84%, respectively.

In summary, the XGBoost model has a high saving score, indicating that it is more effective at reducing the costs associated with fraud detection.